---
title: "Calculate exposure measures based on birth history in the 1982 census"
output: html_notebook
---

```{r setup}
# Initial Version: Created on August 10, 2018.
# Last Checked: Verified for functionality on February 17, 2025.
# The current version is compatible with R version 4.3.0 and R Studio version 2024.12.0+467.

library(tidyverse)
library(psych)
library(haven)

# please check working directory
getwd()
#you can also mannually set the working directory
#workdir <- "....../ReplicationPackage/Rfiles"
#setwd("workdir")

# helper: mutate_type
mutate_type <- function(df, typefun, ...) {
  
  # exprs for mutation
  tvars_exprs <- 
    rlang::exprs(..., .named = TRUE) %>%
    map(~expr(UQ(typefun)(!!.))) # anonymous function
  
  df %>%
    mutate(!!!tvars_exprs)
}


# number of births to consider
nb_keep <- 6

# range of maternal age
minage <- 16
maxage <- 45

# center the tile in ggplot
theme_update(plot.title = element_text(hjust = 0.5))
```


# Data input

## load and clean the 1982 census data

```{r load census 82, cache = TRUE}
# read 1982 census
c1982hh_raw <- read_dta("../data/census1982hh.dta")
```

```{r process data}
vars_exprs <- 
  rlang::exprs( prov, hanP2, eduyP2, birthyP2, !!! syms(str_c("age", 1:nb_keep, "birthP2")) )

c1982hh <-
  c1982hh_raw %>%
  filter(nfather <= 1, nmother <= 1,
         prov != 54, # exclude Tibet
         !sampleChCoresideMoreThanBirth,
         ntwin == 0, ntriplet == 0, nquadruplet == 0, # exclude multiple birth
         birthyP2 >= 1930, birthyP2 <= 1939, # 40+ at the OCP
         !is.na(hanP2),
         as.logical(sampleChCoreside) ) %>%
  select(!!! vars_exprs) %>%
  mutate_type(as.integer, !!! vars_exprs) %>%
  mutate_type(as.logical, hanP2) %>%
  filter(hanP2 == 1)

c1982hh <- 
  c1982hh %>%
  mutate(edulevP2 = fct_collapse(factor(eduyP2),
                                  "N" = "0",
                                  "L" = "6",
                                  "M" = "9",
                                  "H" = c("12", "14", "16")
                                  )
         )

```


## fine data
```{r}
finedata <- 
  read_dta("../raw/OCP/ebenstein_fines.dta") %>%
  mutate(year = as.integer(birthyear),
         prov = as.integer(province)) %>%
  select(year, prov, fine)

finedata %>%
  filter( prov != 54) %>%
  mutate(prov = factor(prov, 
                       levels = c(11,12,13,14,15,
                                  21,22,23,
                                  31,32,33,
                                  34,35,36,37,
                                  41,42,43,44,45,46,
                                  50,51,52,53,
                                  61,62,63,64,65),
                       labels = c("Beijing", "Tianjin", "Hebei", "Shanxi", "Neimenggu",
                                  "Liaoning", "Jilin", "Heilongjiang", 
                                  "Shanghai", "Jiangsu", "Zhejiang", 
                                  "Anhui", "Fujian", "Jiangxi", "Shandong",
                                  "Henan", "Hubei", "Hunan", "Guangdong", "Guangxi", "Hainan",
                                  "Chongqing", "Sichuan", "Guizhou", "Yunnan",
                                  "Shaanxi", "Gansu", "Qinghai", "Ningxia", "Xinjiang")
                       ))  %>%
  ggplot(aes( x = year, y = fine)) + 
  geom_line() + 
  facet_wrap(~prov) +
  labs( x = "Year", y = "Fines in Times of Local Household Annual Income") +
  scale_x_continuous(breaks = seq(1980,2000,10)) + 
  theme_classic()

ggsave("FigureA1_figFineProv.pdf", path = "../output",
       width = 11, height = 8)


# fine in the latest and earliest years
finedata2000 <- finedata %>%
  filter(year == 2000) %>%
  select(prov, fine2000 = fine)

finedata1979 <- finedata %>%
  filter(year == 1979) %>%
  select(prov, fine1979 = fine)  

```

## llf
```{r}
llf <- read_dta("../raw/OCP/llf.dta")

table(llf$llf)
```

## combine llf and fine
```{r}
finellfdata <- finedata %>% 
  filter(prov != 54) %>%
  right_join(expand.grid(prov = c(11:15, 21:23, 31:37, 41:46, 50:53, 61:65),
                         year = 1945:2015),
             by = c("prov", "year")) %>% # expand year to possible years at birth
  left_join(finedata2000, by = "prov") %>%
  left_join(finedata1979, by = "prov") %>%
  mutate(fine = ifelse(year < 1979, 0,  # fill zero fine before 1979
                       ifelse(year >= 2000, fine2000, # fill fine after 2000
                              fine) )) %>%
  left_join(llf, by = "prov") %>% # join with llf
  mutate(Illf = ifelse(is.na(llf), 0, year >= llf) ) %>% # missing llf as non-exposure
  select(-fine2000, -fine1979, -llf)
```


# Construct policy exposure variables

## density of spacing, 2-6 births
```{r}
getSpacing <- function (n) {
  probvar <- str_c("prob", n, "birth") %>% sym
  spacing <- 
    c1982hh %>% 
    mutate(spacing = as.integer( !!sym(str_c("age",n, "birthP2")) 
                                 - !!sym(str_c("age",n-1,"birthP2")) 
                                 )
           ) %>%
    filter(spacing <= 10) %>%
    count(spacing) %>%
    mutate(!! probvar := n/sum(n)) %>%
    select(-n)
}

# test
(getSpacing(2))

spacingProb <-
  seq(2,nb_keep) %>%
  map(getSpacing) %>%
  reduce(full_join, by = "spacing")

# 2-6 births
spacingProb %>%
  gather(birth, prob, prob2birth:prob6birth) %>%
  mutate(birth = str_extract(birth, "\\d") %>% as.factor ) %>%
  ggplot(aes(x = spacing, y = prob, linetype = birth)) + 
  geom_line() +
  scale_x_continuous(breaks = seq(1,10)) +
  labs(x = "Years between births", 
       y = "Proportion") +
  scale_linetype_discrete(name = "Births",
                          labels = c("1st and 2nd", "2nd and 3rd", "3rd and 4th",
                                    "4th and 5th", "5th and 6th")) + 
  theme(legend.position = "bottom")

# 3rd birth
spacingProb %>%
  ggplot(aes(x = spacing, y = prob3birth)) + 
  geom_line() +
  scale_x_continuous(breaks = seq(1,10)) +
  labs(x = "Years between the 2nd and 3rd births", 
       y = "Proportion") +
  theme_classic() +
  theme(legend.position = "bottom")

ggsave("FigureA3_figSpacing3b.pdf", path = "../output",
       width = 8, height = 5)
```


## fine at each birth for prov-yearLastBirth cells
```{r}
grid <-
  expand.grid(prov = c(11:15, 21:23, 31:37, 41:46, 50:53, 61:65),
              yearLastBirthP2 = 1945:1990, 
              spacing = 1:10) %>%
  mutate(yearBirthP2 = yearLastBirthP2 + spacing) 

grid

tempdata <-
  grid %>% 
  left_join(spacingProb, by = "spacing" ) %>%
  left_join(finellfdata, by = c("prov" = "prov", "yearBirthP2" = "year"))

fineBirth <- 
  tempdata %>%
  gather(birth, prob, prob2birth:prob6birth) %>%
  mutate(birth = str_extract(birth, "\\d") %>% as.integer() ) %>%
  mutate(fine_wt = fine * prob) %>%
  group_by(prov, birth, yearLastBirthP2) %>%
  summarise(fineBirth = sum(fine_wt))

write_dta(fineBirth, "../data/fineBirth.dta", version = 13)
```

## density of age at birth for each birth
```{r}
# grid of age
gridAge <- expand.grid(
  age = minage:maxage # age at a policy
) %>% as_tibble

# get probability of age at birth for the Nth birth
getProbAge <- function(n) {
  ageNbirth <- str_c("age", n, "birthP2") %>% sym

  temp <- 
    c1982hh %>%
    filter( !!ageNbirth >= minage, !!ageNbirth <= maxage) %>%
    group_by(!!ageNbirth) %>%
    summarise(obs = n()) %>% # distribution of age at birth
    ungroup() %>%
    rename(age = !!ageNbirth) %>%
    right_join(gridAge, by = "age") %>% # complete grid
    mutate(obs = ifelse(is.na(obs), 0, obs), # fill missing to zero
           prob = obs/sum(obs)) %>%
    select(age, prob) %>%
    mutate(birth = as.integer(n))
}

#(getProbAge(1))

ageBirthProb <-
  c(1:6) %>%
  map(getProbAge) %>%
  reduce(bind_rows)

ageBirthProb %>%
  ggplot(aes(x = age, y = prob, linetype = as.factor(birth) )) +
  geom_line() +
  labs(title = 'Distributions of maternal age at birth, 1-6 birth parities',
       x = 'Maternal age at birth',
       y = 'Probability',
       caption = "The sample includes mothers born in years 1930-1939 in the 1982 China population census") +
  scale_linetype_discrete(name = 'Parity') +
  scale_x_continuous(breaks = seq(15,45,5))  

```

## caculate llf exposure at each birth for prov-birthyP2 cells
```{r}
grid <- 
  expand.grid(prov = c(11:15, 21:23, 31:37, 41:46, 50:53, 61:65),
              birthyP2 = 1931:1970, 
              birth = 1:nb_keep,
              age = minage:maxage) %>%
  arrange(prov, birthyP2, birth, age)

llfExposure <-
  grid %>% 
  left_join(ageBirthProb, by = c("birth", "age")) %>%
  mutate(year = birthyP2 + age) %>% # year of policy for a certain age
  left_join(finellfdata, by = c("prov", "year")) %>%
  group_by(prov, birthyP2, birth) %>%
  summarise(llfExp = sum(prob * Illf) ) %>%
  ungroup %>%
  arrange(prov, birth, birthyP2) %>%
  gather(typeExp, exposure, llfExp) 

# wide form for use in regressions
llfExposureWide <- 
  llfExposure %>%
  mutate(typeExpBirth = str_c(typeExp, birth, "b")) %>%
  select(-birth, -typeExp) %>%
  spread(typeExpBirth, exposure)

write_dta(llfExposureWide, "../data/llfExposureWide.dta",
          version = 13)
```


